Setup

Libraries

library(ggplot2)
library(caTools)
library(plotly)
library(MASS)
library(matlib)
library(dplyr)
library(pROC)

Initialize variables

# Set random seed for reproducibility
set.seed(123)
p      <- 1                               # proportion of predictors that are mutually correlated
npred  <- 3                               # number of predictors
z      <- 0.2                             # pairwise correlation among the correlated predictors
n      <- 200000                           # Number of samples in the dataset
a <- 2.115                                # the effect size

Calculate sigma

#' Build the covariance matrix for the simulated predictors.
#'
#' The first round(n_pred * prop) predictors share pairwise correlation
#' `rho`; any remaining predictors are uncorrelated. Variances are 1, so
#' the matrix is also the correlation matrix.
#'
#' @param n_pred Number of predictors (defaults to the global `npred`).
#' @param prop   Proportion of predictors that are correlated (global `p`).
#' @param rho    Pairwise correlation among correlated predictors (global `z`).
#' @return An n_pred x n_pred covariance matrix.
scenario <- function(n_pred = npred, prop = p, rho = z) {
  corr0 <- matrix(0, n_pred, n_pred)
  # BUG FIX: the original index `1:npred*p` parses as (1:npred) * p because
  # `:` binds tighter than `*`, which yields fractional indices whenever
  # p < 1. seq_len(round(...)) selects the intended leading block of
  # predictors instead (identical to the original when p = 1).
  idx <- seq_len(round(n_pred * prop))
  corr0[idx, idx] <- rho
  diag(corr0) <- 0                 # keep correlations off the diagonal only
  diag(n_pred) + corr0             # unit variances + correlation structure
}

Simulate data

Simulate predictor data from a multivariate normal distribution (via the MASS package) using a common mean vector and the covariance matrix from `scenario()`, which makes the predictors mutually correlated. A binary outcome y is then generated from a logistic model on the predictors (including a predictor1–predictor2 interaction), with the linear predictor centered so that the two classes are roughly balanced. Finally the outcome is appended to the predictors to form a single dataset.

# mean structures
mu0 <- c(rep(1,npred))       # vector: all predictor means set to 1

# covariance structures
sigma0 <- scenario()         # correlated covariance matrix from scenario()

# Draw the predictor matrix from a multivariate normal (MASS::mvrnorm)
data <- mvrnorm(n, mu = mu0, Sigma = sigma0) %>%
  as.data.frame()

# Rename columns to "predictor1", "predictor2" ,...
colnames(data) <- paste0("predictor", seq_len(ncol(data)))

# Linear predictor for the logistic model: three main effects plus a
# predictor1:predictor2 interaction, each scaled by the effect size `a`.
# (The original computed this same expression twice; once is enough.)
eta <- 0.4*a*data$predictor1 + 0.2*a*data$predictor2 + 0.1*a*data$predictor3 +
  0.1*a*data$predictor1*data$predictor2

# Center so that mean Prob(y=1) = mean(plogis(L)) is ~0.5 (50/50 split)
L <- eta - mean(eta)

# Simulate binary y with Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)
data <- cbind(data, y)

# proportion y's in data (should be close to 50/50 after centering)
prop.table(table(data$y))
## 
##       0       1 
## 0.50812 0.49188

Plot distributions for Predictor 1 and Predictor 2

Per-predictor histograms by class

# Class-overlaid histogram for predictor 1
ggplot(data, aes(x = predictor1, fill = as.factor(y))) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  labs(title = "Distribution of Diseased and Non-Diseased Classes for Predictor 1")

# Class-overlaid histogram for predictor 2
ggplot(data, aes(x = predictor2, fill = as.factor(y))) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  labs(title = "Distribution of Diseased and Non-Diseased Classes for Predictor 2")

### 3D Histogram of Predictor 1 and Predictor 2

# 2D kernel density estimates for each class (MASS::kde2d) on a 30x30 grid
density1 <- with(data[data$y == 1, ], kde2d(predictor1, predictor2, n = 30))
density0 <- with(data[data$y == 0, ], kde2d(predictor1, predictor2, n = 30))

# Create the interactive 3D plot.
# FIX: plotly colorscales must be a list of list(position, color) pairs.
# The original mixed c(0, "red") — a character vector c("0", "red") after
# coercion — with list(1, "pink"); normalized to list-of-lists here.
p1 <- plot_ly(z = ~density1$z) %>% 
  add_surface(
    x = ~density1$x,
    y = ~density1$y,
    colorscale = list(list(0, "red"), list(1, "pink")),
    opacity = 0.8
  )

p2 <- plot_ly(z = ~density0$z) %>% 
  add_surface(
    x = ~density0$x,
    y = ~density0$y,
    colorscale = list(list(0, "blue"), list(1, "lightblue")),
    opacity = 0.8
  )

# Side-by-side surfaces sharing axes
plot <- subplot(p1, p2, nrows = 1, shareX = TRUE, shareY = TRUE)
plot

Logistic Regression and checking AUC for linear separability

n_iter <- 1600                             # Number of iterations for logistic regression
auc_values <- numeric(n_iter)              # Vector to store (in-sample) AUC values
sample_ratio <- 0.15                        # Proportion of data to use in each logistic regression

# Running sum of fitted coefficients. Named coef_sum rather than
# `coefficients` to avoid shadowing stats::coefficients().
coef_sum <- NULL

# Run logistic regression and calculate AUC for n_iter iterations
for (i in seq_len(n_iter)) {
  # Sample a subset of the data (without replacement). The subset is named
  # fit_data so it no longer shadows the sample_data() function below.
  idx <- sample(seq_len(nrow(data)), sample_ratio * nrow(data))
  fit_data <- data[idx, ]

  # Logistic regression: all main effects plus predictor1:predictor2
  model <- glm(y ~ . + predictor1*predictor2, data = fit_data, family = binomial)

  # Predict on the same sample set (so AUC is in-sample / optimistic by design)
  predictions <- predict(model, newdata = fit_data, type = "response")

  # Calculate AUC (stored directly; avoids shadowing pROC::auc)
  auc_values[i] <- colAUC(predictions, fit_data$y, plotROC = FALSE)

  # Accumulate coefficients so they can be averaged after the loop
  coef_sum <- if (is.null(coef_sum)) model$coefficients else coef_sum + model$coefficients
}

# Calculate average AUC
average_auc <- mean(auc_values)
print(paste("Average AUC: ", round(average_auc, 4)))
## [1] "Average AUC:  0.8"
# Calculate average coefficients
average_coefficients <- coef_sum / n_iter
print(paste("Average coefficients: ", round(average_coefficients, 2)))
## [1] "Average coefficients:  -1.73" "Average coefficients:  0.85" 
## [3] "Average coefficients:  0.42"  "Average coefficients:  0.21" 
## [5] "Average coefficients:  0.21"

Sampling from the population

#' Draw a fixed-size sample with a specified class balance.
#'
#' @param data      Data frame containing a binary column `y` (0/1).
#' @param size      Total number of rows to return.
#' @param perc_ones Target proportion of rows with y == 1.
#' @param seed      Optional RNG seed for reproducibility.
#' @return A shuffled data frame of up to `size` rows; fewer (with a
#'   warning) if a class has too few rows to meet the request.
sample_data <- function(data, size, perc_ones, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  
  # Separate data into two subsets based on y
  data_0 <- filter(data, y == 0)
  data_1 <- filter(data, y == 1)
  
  # Calculate the number of samples required from each subset
  num_ones <- round(size * perc_ones)
  num_zeros <- size - num_ones
  
  # Warn (instead of silently under-sampling) when a class is too small;
  # the returned sample will then be smaller and/or differently balanced.
  if (num_zeros > nrow(data_0) || num_ones > nrow(data_1)) {
    warning("Requested more rows than available in a class; ",
            "returning fewer rows than `size`.", call. = FALSE)
  }
  
  # Sample from each subset
  sampled_0 <- sample_n(data_0, min(num_zeros, nrow(data_0)))
  sampled_1 <- sample_n(data_1, min(num_ones, nrow(data_1)))
  
  # Combine the sampled subsets and shuffle the rows
  sampled_data <- rbind(sampled_0, sampled_1)
  sampled_data[sample(nrow(sampled_data)), ]
}

# Draw fixed-size samples (n = 100) from the population at three class balances
sample_data_50 <- sample_data(data, 100, 0.5, seed = 28)   # 50% cases
sample_data_20 <- sample_data(data, 100, 0.2, seed = 29)   # 20% cases
sample_data_5 <- sample_data(data, 100, 0.05, seed = 30)   # 5% cases

Plotting ROC curves

#' Repeatedly sample, fit a logistic model, and evaluate on a held-out split.
#'
#' @param original_data Population data frame with a binary outcome `y`.
#' @param times         Number of sample/fit/evaluate iterations.
#' @param sample_size   Rows drawn from the population each iteration.
#' @param perc_ones     Target proportion of y == 1 in each sample.
#' @param test_size     Fraction of each sample held out for testing.
#' @param seed          Optional RNG seed, set once before the loop.
#' @return A list: AverageAUC (mean test-set AUC) and ROCPlot (ggplot of
#'   all per-iteration ROC curves with the chance diagonal).
fit_and_evaluate <- function(original_data, times, sample_size, perc_ones, test_size = 0.2, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  
  auc_values <- numeric(times)         # test-set AUC per iteration
  roc_pieces <- vector("list", times)  # ROC points, bound together after the loop
  
  for (i in seq_len(times)) {
    # Sample from the original dataset. BUG FIX: the original forwarded
    # `seed` into sample_data() here, resetting the RNG to the same state
    # on every pass — all `times` iterations produced identical samples
    # (and identical models) whenever a seed was supplied.
    data <- sample_data(original_data, size = sample_size, perc_ones = perc_ones)
    
    # Split the data into train/test
    index <- sample(seq_len(nrow(data)), size = round(nrow(data) * test_size))
    test_set <- data[index, ]
    train_set <- data[-index, ]
    
    # Fit logistic regression model (main effects only)
    model <- glm(y ~ ., data = train_set, family = "binomial")
    
    # Predicted probabilities on the held-out test set
    predictions <- predict(model, newdata = test_set, type = "response")
    
    # Evaluate model performance using AUC (pROC)
    roc_obj <- suppressMessages(roc(response = test_set$y, predictor = predictions, quiet = TRUE))
    auc_values[i] <- auc(roc_obj)
    
    # Store ROC curve points (preallocated list avoids rbind-in-loop growth)
    roc_pieces[[i]] <- data.frame(
      tpr = roc_obj$sensitivities,
      fpr = 1 - roc_obj$specificities,
      iteration = i
    )
  }
  
  roc_curves <- do.call(rbind, roc_pieces)
  
  # Calculate the average AUC
  mean_auc <- mean(auc_values)
  
  # Plot all ROC curves plus the chance diagonal
  roc_plot <- ggplot(roc_curves, aes(x = fpr, y = tpr, group = iteration, color = as.factor(iteration))) +
              geom_line(alpha = 0.3) +
              geom_line(data = data.frame(fpr = c(0, 1), tpr = c(0, 1)),
                        aes(x = fpr, y = tpr), linetype = "dashed", inherit.aes = FALSE) +
              theme_minimal() +
              labs(color = "Iteration") +
              ggtitle("ROC Curves from Multiple Iterations")
  
  return(list(AverageAUC = mean_auc, ROCPlot = roc_plot))
}
# Balanced samples (50% cases): effect of sample size on test AUC
results.5.100 <- fit_and_evaluate(data, times = 40, sample_size = 100, test_size = .7, perc_ones = 0.5)
results.5.200 <- fit_and_evaluate(data, times = 40, sample_size = 200, test_size = .7, perc_ones = 0.5)
results.5.400 <- fit_and_evaluate(data, times = 40, sample_size = 400, test_size = .7, perc_ones = 0.5)

# Print average AUC (increases with sample size)
print(results.5.100$AverageAUC)
## [1] 0.7752198
print(results.5.200$AverageAUC)
## [1] 0.7805253
print(results.5.400$AverageAUC)
## [1] 0.7937877
# Plot ROC curves
print(results.5.100$ROCPlot)

print(results.5.200$ROCPlot)

print(results.5.400$ROCPlot)

# Moderately imbalanced samples (20% cases)
results.20.100 <- fit_and_evaluate(data, times = 40, sample_size = 100, test_size = .7, perc_ones = 0.2)
results.20.200 <- fit_and_evaluate(data, times = 40, sample_size = 200, test_size = .7, perc_ones = 0.2)
results.20.400 <- fit_and_evaluate(data, times = 40, sample_size = 400, test_size = .7, perc_ones = 0.2)

# Print average AUC
print(results.20.100$AverageAUC)
## [1] 0.7464821
print(results.20.200$AverageAUC)
## [1] 0.7534212
print(results.20.400$AverageAUC)
## [1] 0.7882064
# Plot ROC curves
print(results.20.100$ROCPlot)

print(results.20.200$ROCPlot)

print(results.20.400$ROCPlot)

# Heavily imbalanced samples (5% cases)
results.05.100 <- fit_and_evaluate(data, times = 40, sample_size = 100, test_size = .7, perc_ones = 0.05)
results.05.200 <- fit_and_evaluate(data, times = 40, sample_size = 200, test_size = .7, perc_ones = 0.05)
results.05.400 <- fit_and_evaluate(data, times = 40, sample_size = 400, test_size = .7, perc_ones = 0.05)

# Print average AUC (lower under heavy imbalance, especially at small n)
print(results.05.100$AverageAUC)
## [1] 0.667671
print(results.05.200$AverageAUC)
## [1] 0.702614
print(results.05.400$AverageAUC)
## [1] 0.7497505
# Plot ROC curves
print(results.05.100$ROCPlot)

print(results.05.200$ROCPlot)

print(results.05.400$ROCPlot)